Preparación del entorno de trabajo y carga de librerías

# Environment setup: load all required libraries -------------------------
# NOTE(review): the original called rm(list = ls()) here. It was removed:
# it does not give a clean session (loaded packages, options and devices
# persist) and it silently wipes the user's workspace. Restart R instead
# for a reproducible run.

suppressPackageStartupMessages({
  library(data.table)
  library(dplyr)
  library(caret)
  library(scales)
  library(ggplot2)
  library(stringi)
  library(stringr)
  library(dataPreparation)
  library(knitr)
  library(kableExtra)
  library(ggpubr)
  library(tictoc)
  library(ggeasy)
  library(lubridate)
  library(inspectdf)
  library(ranger)
  library(gbm)
  library(MLmetrics)
})

1 Introducción al problema

Utilizando los datos provistos por el Ministerio del agua de Tanzania, se requiere construir un modelo que sea capaz de predecir cuales bombas de agua están operativas, operativas pero que necesitan reparación o están dañadas, basadas en un set de datos train.

En el primer modelo solo se realizaron pruebas con las variables numéricas, obteniendo un scoring aceptable pero mejorable. En el segundo modelo se consiguió una mejora sustantiva y ahora, en este tercer modelo, se verá si se puede optimizar un poco más.

2 Exploración inicial del conjunto de datos

De forma original se incluyen los datasets train y test más el archivo con las labels.

# Data loading -----------------------------------------------------------

# Train data
vtrain <- fread("train_set.csv")
vtrain$flag <- 1 # 1 = row belongs to the train set, 0 = test set

# Test data
vtest <- fread("test_set.csv")
vtest$flag <- 0 # 1 = row belongs to the train set, 0 = test set

# Labels file (target variable)
vlabels <- fread("labels.csv")

# Join the labels with the train set.
# FIX: make the join key explicit; relying on merge()'s default
# (all common columns) is fragile if either file gains a column.
train <- merge(vlabels, vtrain, by = "id")

# Stack train and test so feature engineering is applied consistently
# to both. rbindlist() keeps the data.table class without the extra
# rbind() + as.data.table() coercion.
datos <- rbindlist(list(vtrain, vtest))

# Sanity checks
head(datos)
head(vlabels)

# Structure / distribution of the data
str(datos)
## Classes 'data.table' and 'data.frame':   74250 obs. of  41 variables:
##  $ id                   : int  69572 8776 34310 67743 19728 9944 19816 54551 53934 46144 ...
##  $ amount_tsh           : num  6000 0 25 0 0 20 0 0 0 0 ...
##  $ date_recorded        : IDate, format: "2011-03-14" "2013-03-06" ...
##  $ funder               : chr  "Roman" "Grumeti" "Lottery Club" "Unicef" ...
##  $ gps_height           : int  1390 1399 686 263 0 0 0 0 0 0 ...
##  $ installer            : chr  "Roman" "GRUMETI" "World vision" "UNICEF" ...
##  $ longitude            : num  34.9 34.7 37.5 38.5 31.1 ...
##  $ latitude             : num  -9.86 -2.15 -3.82 -11.16 -1.83 ...
##  $ wpt_name             : chr  "none" "Zahanati" "Kwa Mahundi" "Zahanati Ya Nanyumbu" ...
##  $ num_private          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ basin                : chr  "Lake Nyasa" "Lake Victoria" "Pangani" "Ruvuma / Southern Coast" ...
##  $ subvillage           : chr  "Mnyusi B" "Nyamara" "Majengo" "Mahakamani" ...
##  $ region               : chr  "Iringa" "Mara" "Manyara" "Mtwara" ...
##  $ region_code          : int  11 20 21 90 18 4 17 17 14 18 ...
##  $ district_code        : int  5 2 4 63 1 8 3 3 6 1 ...
##  $ lga                  : chr  "Ludewa" "Serengeti" "Simanjiro" "Nanyumbu" ...
##  $ ward                 : chr  "Mundindi" "Natta" "Ngorika" "Nanyumbu" ...
##  $ population           : int  109 280 250 58 0 1 0 0 0 0 ...
##  $ public_meeting       : logi  TRUE NA TRUE TRUE TRUE TRUE ...
##  $ recorded_by          : chr  "GeoData Consultants Ltd" "GeoData Consultants Ltd" "GeoData Consultants Ltd" "GeoData Consultants Ltd" ...
##  $ scheme_management    : chr  "VWC" "Other" "VWC" "VWC" ...
##  $ scheme_name          : chr  "Roman" "" "Nyumba ya mungu pipe scheme" "" ...
##  $ permit               : logi  FALSE TRUE TRUE TRUE TRUE TRUE ...
##  $ construction_year    : int  1999 2010 2009 1986 0 2009 0 0 0 0 ...
##  $ extraction_type      : chr  "gravity" "gravity" "gravity" "submersible" ...
##  $ extraction_type_group: chr  "gravity" "gravity" "gravity" "submersible" ...
##  $ extraction_type_class: chr  "gravity" "gravity" "gravity" "submersible" ...
##  $ management           : chr  "vwc" "wug" "vwc" "vwc" ...
##  $ management_group     : chr  "user-group" "user-group" "user-group" "user-group" ...
##  $ payment              : chr  "pay annually" "never pay" "pay per bucket" "never pay" ...
##  $ payment_type         : chr  "annually" "never pay" "per bucket" "never pay" ...
##  $ water_quality        : chr  "soft" "soft" "soft" "soft" ...
##  $ quality_group        : chr  "good" "good" "good" "good" ...
##  $ quantity             : chr  "enough" "insufficient" "enough" "dry" ...
##  $ quantity_group       : chr  "enough" "insufficient" "enough" "dry" ...
##  $ source               : chr  "spring" "rainwater harvesting" "dam" "machine dbh" ...
##  $ source_type          : chr  "spring" "rainwater harvesting" "dam" "borehole" ...
##  $ source_class         : chr  "groundwater" "surface" "surface" "groundwater" ...
##  $ waterpoint_type      : chr  "communal standpipe" "communal standpipe" "communal standpipe multiple" "communal standpipe multiple" ...
##  $ waterpoint_type_group: chr  "communal standpipe" "communal standpipe" "communal standpipe" "communal standpipe" ...
##  $ flag                 : num  1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, ".internal.selfref")=<externalptr>

Distribución de la variable objetivo (balanceo). Se observa un poco de desbalance, el cual no debería afectar demasiado.

# Class balance of the target variable: absolute counts per class.
cuenta <- vlabels %>% count(status_group)
# Percentage per class, rounded to 2 decimals.
porcentaje <- round( prop.table(table(vlabels$status_group))*100, 2)
kable(cuenta, col.names = c('status_group', 'count'))
status_group count
functional 32259
functional needs repair 4317
non functional 22824
kable(porcentaje, col.names = c('status_group', '%'))
status_group %
functional 54.31
functional needs repair 7.27
non functional 38.42
barplot(porcentaje, col=rgb(0.2,0.4,0.6,0.6))

3 Exploratory data analysis EDA

# EDA overview with inspectdf: one summary plot per aspect of `datos`.

# categorical plot
x <- inspect_cat(datos) 
show_plot(x)

# correlations in numeric columns
# (the warning is expected: `flag` is constant within the stacked data)
x <- inspect_cor(datos)
## Warning: Columns with 0 variance found: flag
show_plot(x)

# feature imbalance bar plot
x <- inspect_imb(datos)
show_plot(x)

# memory usage barplot
x <- inspect_mem(datos)
show_plot(x)

# missingness barplot
x <- inspect_na(datos)
show_plot(x)

# histograms for numeric columns
x <- inspect_num(datos)
show_plot(x)

# barplot of column types
x <- inspect_types(datos)
show_plot(x)

3.1 Observaciones EDA

  1. Una gran parte de las features son de tipo categoricas

  2. Se deben explorar los missing values (en gris) en el primer gráfico de frecuencias de las categóricas.

  3. wpt_name, subvillage, scheme_name e installer son las que ocupan mas espacio en memoria, si se revisan en el gráfico de frecuencias de las categóricas, son las que mas categorias contienen por cada una de las variables.

  4. public_meeting y permit muestran columnas con % de NA (5.6% y 5.1% respectivamente)

  5. La feature construction_year tiene valores en 0, es decir, no hay información codificada del año de construcción de la bomba de agua.

  6. Tambien se deben explorar las demas variables numéricas con 0 para determinar que hacer con ellas.

4 Feature engineering

Aqui se exploraran features numéricas y categóricas.

Se exploran algunas columnas como categoricas para construir el modelo:

No se incluyen todas las variables pues aunque podrían mejorar el modelo sería de un costo computacional alto y ademas varias contienen información muy similar entre unas y otras.

# Remove attributes that will not be used
names(datos)
##  [1] "id"                    "amount_tsh"            "date_recorded"        
##  [4] "funder"                "gps_height"            "installer"            
##  [7] "longitude"             "latitude"              "wpt_name"             
## [10] "num_private"           "basin"                 "subvillage"           
## [13] "region"                "region_code"           "district_code"        
## [16] "lga"                   "ward"                  "population"           
## [19] "public_meeting"        "recorded_by"           "scheme_management"    
## [22] "scheme_name"           "permit"                "construction_year"    
## [25] "extraction_type"       "extraction_type_group" "extraction_type_class"
## [28] "management"            "management_group"      "payment"              
## [31] "payment_type"          "water_quality"         "quality_group"        
## [34] "quantity"              "quantity_group"        "source"               
## [37] "source_type"           "source_class"          "waterpoint_type"      
## [40] "waterpoint_type_group" "flag"
# Drop all unused columns in one data.table call instead of ten
# separate `$<- NULL` assignments:
# - numeric columns with little impact in previous models
#   (amount_tsh, num_private, region_code, district_code)
# - redundant or uninformative columns (near-duplicates of other features)
datos[, c(
  "amount_tsh", "num_private", "region_code", "district_code",
  "date_recorded", "recorded_by", "permit", "public_meeting",
  "extraction_type_class", "waterpoint_type_group"
) := NULL]

Se inspecciona el nuevo dataset

x <- inspect_types(datos)
show_plot(x)

Revision de valores NA o ceros

NA,

# Missingness before recoding zeros.
x <- inspect_na(datos)
show_plot(x)

# Recode 0 as NA in every column except `flag`, so the zero-coded missing
# values become visible in the missingness plot.
# FIX: mutate_at() is superseded; across() is the current dplyr idiom.
temp <- datos %>% mutate(across(-flag, ~ na_if(.x, 0)))

x <- inspect_na(temp)
show_plot(x)

Se convierten los na anteriores en ceros otra vez.

# Revert: the zeros were only turned into NA for the missingness plot
# above; restore them so downstream code sees the original encoding.
temp[is.na(temp)] <- 0
head(temp)

Se construyen algunas variables nuevas que pueden ser útiles a la hora de modelar. Se agregan con una fe_ para identificarlas.

La variable construction_year tiene un % importante de valores missing. Se crea una columna nueva con valores missing imputados segun la media.

# Impute construction_year == 0 (missing) with the rounded mean of the
# known (non-zero) years.
# NOTE(review): the mean is computed over train+test combined — a mild
# form of leakage; acceptable for this exercise but worth confirming.
temp$fe_construction_year<-round(ifelse(temp$construction_year==0, mean(temp$construction_year[temp$construction_year>0]),temp$construction_year), 0)
# Combined latitude/longitude magnitude (Euclidean norm of the pair).
temp$fe_lonlat  <- sqrt(temp$longitude^2 + temp$latitude^2)

Otra variable que calcula la antigüedad de la bomba basada en su fecha de construcción

# Pump age in years at the time the script is run.
# FIX: the original assigned the result to `year`, shadowing
# lubridate::year(); use a distinct local name.
# NOTE(review): deriving the age from now() makes this feature drift
# between runs; consider pinning it to the data-collection year for
# reproducibility.
current_year <- year(now())
temp$fe_antiguedad <- (current_year - temp$fe_construction_year)

Se elimina la variable construction_year

temp$construction_year <- NULL

La feature population tiene un % relevante de valores en 0

temp$fe_population<-round(ifelse(temp$population==0, mean(temp$population[temp$population>0]),temp$population), 0)

Se elimina la variable population

temp$population <- NULL

Se estudian las categóricas, hay features con un numero alto de categorías

# Categorical features and their cardinality -----------------------------
categoricas <- names(temp)[vapply(temp, is.character, logical(1))]

# Number of distinct levels per categorical column.
# FIX: apply(temp[, ..categoricas], 2, ...) coerces the data.table to a
# character matrix first; vapply() over the columns avoids the coercion
# and guarantees a named integer result.
freq_inicial <- vapply(temp[, ..categoricas], function(col) length(unique(col)), integer(1L))
freq_inicial
##                funder             installer              wpt_name 
##                  2141                  2411                 45684 
##                 basin            subvillage                region 
##                     9                 21426                    21 
##                   lga                  ward     scheme_management 
##                   125                  2098                    13 
##           scheme_name       extraction_type extraction_type_group 
##                  2869                    18                    13 
##            management      management_group               payment 
##                    12                     5                     7 
##          payment_type         water_quality         quality_group 
##                     7                     8                     6 
##              quantity        quantity_group                source 
##                     5                     5                    10 
##           source_type          source_class       waterpoint_type 
##                     7                     3                     7

Para estos casos se decide utilizar la técnica de sustituir cada categoría según su frecuencia de aparición

feature: funder

# Frequency encoding of `funder` — the same recipe is repeated for every
# high-cardinality categorical below: each category label is replaced by
# the number of rows in which it appears, then the original column is
# dropped.
cfunder<-unique(temp[ , .(.N), by = .(funder)][order(-N)])
# Share of each category as a percentage (kept only for inspection).
cfunder$perc<-cfunder$N/length(temp$funder)*100

temp[ , fe_funder := .N , by = .(funder)]

temp$funder <- NULL # drop the original feature

feature: installer

cinstaller<-unique(temp[ , .(.N), by = .(installer)][order(-N)])
cinstaller$perc<-cinstaller$N/length(temp$installer)*100

temp[ ,fe_installer := .N , by = .(installer)]

temp$installer <- NULL # elimina feature

feature: wpt_name

cwpt<-unique(temp[ , .(.N), by = .(wpt_name)][order(-N)])
cwpt$perc<-cwpt$N/length(temp$wpt_name)*100

temp[ ,fe_wpt_name := .N , by = .(wpt_name)]

temp$wpt_name <- NULL # elimina feature

feature: basin

cbasin<-unique(temp[ , .(.N), by = .(basin)][order(-N)])
cbasin$perc<-cbasin$N/length(temp$basin)*100

temp[ ,fe_basin := .N , by = .(basin)]

temp$basin <- NULL # elimina feature

feature: subvillage

csubvillage<-unique(temp[ , .(.N), by = .(subvillage)][order(-N)])
csubvillage$perc<-csubvillage$N/length(temp$subvillage)*100

temp[ ,fe_subvillage := .N , by = .(subvillage)]

temp$subvillage <- NULL # elimina feature

feature: region

cregion<-unique(temp[ , .(.N), by = .(region)][order(-N)])
cregion$perc<-cregion$N/length(temp$region)*100

temp[ ,fe_region := .N , by = .(region)]

temp$region <- NULL # elimina feature

feature: lga

clga<-unique(temp[ , .(.N), by = .(lga)][order(-N)])
clga$perc<-clga$N/length(temp$lga)*100

temp[ ,fe_lga := .N , by = .(lga)]

temp$lga <- NULL # elimina feature

feature: ward

cward<-unique(temp[ , .(.N), by = .(ward)][order(-N)])
cward$perc<-cward$N/length(temp$ward)*100

temp[ ,fe_ward := .N , by = .(ward)]

temp$ward<- NULL # elimina feature

feature: scheme_management

cscheme<-unique(temp[ , .(.N), by = .(scheme_management)][order(-N)])
cscheme$perc<-cscheme$N/length(temp$scheme_management)*100

temp[ ,fe_scheme := .N , by = .(scheme_management)]

temp$scheme_management <- NULL # elimina feature

feature: scheme_name

cscheme_name<-unique(temp[ , .(.N), by = .(scheme_name)][order(-N)])
cscheme_name$perc<-cscheme_name$N/length(temp$scheme_name)*100

temp[ ,fe_scheme_name := .N , by = .(scheme_name)]

temp$scheme_name <- NULL # elimina feature

feature: extraction_type

cextraction_type<-unique(temp[ , .(.N), by = .(extraction_type)][order(-N)])
cextraction_type$perc<-cextraction_type$N/length(temp$extraction_type)*100

temp[ ,fe_extract_type := .N , by = .(extraction_type)]

temp$extraction_type <- NULL # elimina feature

feature: extraction_type_group

cextraction<-unique(temp[ , .(.N), by = .(extraction_type_group)][order(-N)])
cextraction$perc<-cextraction$N/length(temp$extraction_type_group)*100

temp[ ,fe_extract := .N , by = .(extraction_type_group)]

temp$extraction_type_group <- NULL # elimina feature

feature: management

cmanagement<-unique(temp[ , .(.N), by = .(management)][order(-N)])
cmanagement$perc<-cmanagement$N/length(temp$management)*100

temp[ ,fe_management := .N , by = .(management)]

temp$management <- NULL # elimina feature

feature: management_group

cmanagementg<-unique(temp[ , .(.N), by = .(management_group)][order(-N)])
cmanagementg$perc<-cmanagementg$N/length(temp$management_group)*100

temp[ ,fe_management_g := .N , by = .(management_group)]

temp$management_group <- NULL # elimina feature

feature: payment

cpayment<-unique(temp[ , .(.N), by = .(payment)][order(-N)])
cpayment$perc<-cpayment$N/length(temp$payment)*100

temp[ ,fe_payment := .N , by = .(payment)]

temp$payment <- NULL # elimina feature

feature: payment_type

# Frequency encoding of `payment_type`.
cpayment_t<-unique(temp[ , .(.N), by = .(payment_type)][order(-N)])
# BUG FIX: perc was computed from cpayment$N (the counts of the `payment`
# feature) instead of cpayment_t$N.
cpayment_t$perc<-cpayment_t$N/length(temp$payment_type)*100

# BUG FIX: the encoded column was named fe_payment, silently overwriting
# the encoding of `payment` created above; name it fe_payment_type.
temp[ ,fe_payment_type := .N , by = .(payment_type)]

temp$payment_type <- NULL # drop the original feature

feature: water_quality

# Frequency encoding of `water_quality`.
cwater<-unique(temp[ , .(.N), by = .(water_quality)][order(-N)])
# BUG FIX: the denominator referenced temp$cwater, a non-existent column
# (length 0), which made every perc value Inf; use the actual
# water_quality column.
cwater$perc<-cwater$N/length(temp$water_quality)*100

temp[ ,fe_water_quality := .N , by = .(water_quality)]

temp$water_quality <- NULL # drop the original feature

feature: quality_group

cquality_group<-unique(temp[ , .(.N), by = .(quality_group)][order(-N)])
cquality_group$perc<-cquality_group$N/length(temp$quality_group)*100

temp[ ,fe_quality_group := .N , by = .(quality_group)]

temp$quality_group <- NULL # elimina feature

feature: quantity

cquantity<-unique(temp[ , .(.N), by = .(quantity)][order(-N)])
cquantity$perc<-cquantity$N/length(temp$quantity)*100

temp[ ,fe_quantity := .N , by = .(quantity)]

temp$quantity <- NULL # elimina feature

feature: quantity_group

cquantity_g<-unique(temp[ , .(.N), by = .(quantity_group)][order(-N)])
cquantity_g$perc<-cquantity_g$N/length(temp$quantity_group)*100

temp[ ,fe_quantity_group := .N , by = .(quantity_group)]

temp$quantity_group <- NULL # elimina feature

feature: source

csource<-unique(temp[ , .(.N), by = .(source)][order(-N)])
csource$perc<-csource$N/length(temp$source)*100

temp[ ,fe_source := .N , by = .(source)]

temp$source <- NULL # elimina feature

feature: source_type

csource_ct<-unique(temp[ , .(.N), by = .(source_type)][order(-N)])
csource_ct$perc<-csource_ct$N/length(temp$source_type)*100

temp[ ,fe_source_type := .N , by = .(source_type)]

temp$source_type <- NULL # elimina feature

feature: source_class

csource_c<-unique(temp[ , .(.N), by = .(source_class)][order(-N)])
csource_c$perc<-csource_c$N/length(temp$source_class)*100

temp[ ,fe_source_class := .N , by = .(source_class)]

temp$source_class <- NULL # elimina feature

feature: waterpoint_type

cwaterpoint<-unique(temp[ , .(.N), by = .(waterpoint_type)][order(-N)])
cwaterpoint$perc<-cwaterpoint$N/length(temp$waterpoint_type)*100

temp[ ,fe_waterpoint_type := .N , by = .(waterpoint_type)]

temp$waterpoint_type <- NULL # elimina feature

5 Pasos previos a modelizar

# Split train and test back apart using the flag column added at load time
trainset<-temp[temp$flag==1,]

#table(trainset$flag) sanity check
testset<-temp[temp$flag==0,]

# Attach the target variable to the train set
# (sort = FALSE preserves the original row order)
trainset <- merge(trainset, vlabels, by ='id', sort = FALSE)

# Drop the flag column from both sets, and id from the train set only
# (testset keeps id: it is needed for the submission file below)
trainset$flag <- NULL
testset$flag <- NULL
trainset$id <-NULL

La columna status_group indica en palabras si es funcional o no. Se recodifica para simplificar

# Recode the target to integer classes:
#   functional -> 0, functional needs repair -> 1, non functional -> 2
trainset <- trainset %>%
  mutate(status_group = case_when(
    status_group == "functional"              ~ 0,
    status_group == "functional needs repair" ~ 1,
    TRUE                                      ~ 2
  ))
table(trainset$status_group)
## 
##     0     1     2 
## 32259  4317 22824

6 Construccion de modelo

6.1 Random forest con ranger

Tomando como base lo anterior y considerando las variables seleccionadas anteriormente se construye el tercer modelo

# Random forest via ranger; hyperparameters carried over from model 2.
fit <- ranger(status_group ~. ,
               data = trainset,
                num.trees = 300,         # trees in the forest
                mtry=6,                  # features tried at each split
                importance = 'impurity', # Gini-based variable importance
                write.forest = TRUE,     # keep the forest for prediction
                min.node.size = 1,       # grow trees to full depth
                splitrule = "gini",
                verbose = TRUE,
                classification = TRUE,   # target is integer-coded classes
               seed=1234                 # reproducibility
                )

Se despliegan los resultados

print(fit)
## Ranger result
## 
## Call:
##  ranger(status_group ~ ., data = trainset, num.trees = 300, mtry = 6,      importance = "impurity", write.forest = TRUE, min.node.size = 1,      splitrule = "gini", verbose = TRUE, classification = TRUE,      seed = 1234) 
## 
## Type:                             Classification 
## Number of trees:                  300 
## Sample size:                      59400 
## Number of independent variables:  30 
## Mtry:                             6 
## Target node size:                 1 
## Variable importance mode:         impurity 
## Splitrule:                        gini 
## OOB prediction error:             18.55 %

El modelo 3 presenta una pequeña mejora con respecto al 2.

# Confusion matrix on the TRAIN set: near-perfect by construction (random
# forests memorize their training data); the OOB prediction error printed
# above (~18.5%) is the honest generalization estimate.
predictions_train <- predict(fit, data = trainset)
confusionMatrix(table( trainset$status_group, predictions_train$predictions))
## Confusion Matrix and Statistics
## 
##    
##         0     1     2
##   0 32174    35    50
##   1   262  4013    42
##   2   244    30 22550
## 
## Overall Statistics
##                                          
##                Accuracy : 0.9888         
##                  95% CI : (0.988, 0.9897)
##     No Information Rate : 0.5502         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.9797         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2
## Sensitivity            0.9845  0.98406   0.9959
## Specificity            0.9968  0.99450   0.9925
## Pos Pred Value         0.9974  0.92958   0.9880
## Neg Pred Value         0.9814  0.99882   0.9975
## Prevalence             0.5502  0.06865   0.3812
## Detection Rate         0.5416  0.06756   0.3796
## Detection Prevalence   0.5431  0.07268   0.3842
## Balanced Accuracy      0.9907  0.98928   0.9942

Se predice sobre los datos del concurso

# Predict on the competition test set and build the submission file.
predictions_concurso <- predict(fit, data = testset)

# FIX: build the data.frame directly instead of as.data.frame(cbind(...)),
# which needlessly coerces everything through a matrix first.
resultados_concurso <- data.frame(
  id = testset$id,
  status_group = predictions_concurso$predictions
)

# Map the integer classes back to the labels expected by the platform.
resultados_concurso$status_group<-ifelse(resultados_concurso$status_group==0,"functional",
                               ifelse(resultados_concurso$status_group==1,"functional needs repair","non functional"))

# Save the file that will be uploaded to the platform
fwrite(resultados_concurso, file = "results_model3.csv")

Variables importantes

# Variable importance as a two-column data.table (vars_imp, myvar),
# sorted in descending importance.
importancias <- fit$variable.importance
vars_imp <- data.table(vars_imp = importancias, myvar = names(importancias))
setorder(vars_imp, -vars_imp)

Plot de variables mas importantes

# Horizontal bar plot of variable importance (most important at the top).
ggbarplot(vars_imp,
          x = "myvar", y = "vars_imp",
          #fill  = 'myvar',
          color = "blue",
          palette = "jco",
          sort.val = "asc",      # ascending + rotate => largest bar on top
          sort.by.groups = FALSE,
          x.text.angle = 90,
          ylab = "Importancia",
          xlab = 'Variable',
          #legend.title = "MPG Group",
          rotate = TRUE,         # horizontal bars
          ggtheme = theme_minimal()
          )

7 Conclusiones

  1. Las features con más peso predictivo para este modelo son las que tienen que ver con localización (latitud, longitud) y la recategorización fe_quantity (cuánta cantidad de agua tiene la bomba) — las mismas features que lideran el peso predictivo en los modelos 1 y 2 — además de fe_waterpoint_type. Es decir, las recategorizaciones de categóricas según frecuencias se vieron reflejadas en este modelo.

  2. Este tercer modelo tiene un scoring de 0.8213 lo cual es una mejora pequeña pero de todos modos relevante respecto al segundo modelo. A modo personal considero que el puntaje es bueno aunque mejorable pero quizás a costa de crear un modelo más grande y complicado si quisiera incluir todas las variables lo cual no creo que sea lo más eficiente. Además varias features contienen información similar